import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sea
from math import pi
import numpy as np
import plotly.express as px
from scipy import stats
df = pd.read_csv('supermarket_updated.csv')

# Data cleaning
# Drop rows that contain missing values. DataFrame.dropna() returns a new
# frame instead of modifying the original, so the result must be assigned
# back; calling it bare leaves df unchanged.
df = df.dropna()
# Remove the columns that are not used in this analysis: invoice id, tax,
# city and gross margin percentage.
df = df.drop(columns=['Invoice ID', 'Tax 5%', 'City', 'gross margin percentage'])
# Which branch brings in the most revenue? Sum 'Total' per branch and keep
# the branches in a fixed A/B/C order.
branch_order = ['A', 'B', 'C']
sales_by_branch = df.groupby('Branch')['Total'].sum().reindex(branch_order)

sea.set_palette('pastel')

# Bar chart of the total sales for each branch
plt.figure(figsize=(10, 6), dpi=300)
bar = sales_by_branch.plot(kind='bar', fontsize=14, title='Total Sales by Branch')
plt.ylabel('Total Sales', fontsize=14, fontweight='bold')
plt.xlabel('Branch', fontsize=14, fontweight='bold')
plt.xticks()
plt.title('Total Sales by Branch', fontsize=16, fontweight='bold')

# Write each bar's value just above its top edge
for patch in bar.patches:
    bar_height = patch.get_height()
    bar_centre = patch.get_x() + patch.get_width() / 2.
    bar.annotate(
        f'{bar_height:.0f}',
        (bar_centre, bar_height),
        ha='center',
        va='center',
        xytext=(0, 9),
        textcoords='offset points',
        fontweight='semibold',
    )

plt.tight_layout()
plt.show()
# Distribution of sale totals per branch, one box per branch. The original
# analysis notes read this chart as Branch C having the fewest outliers and
# the highest median, which is why the later sections focus on Branch C.
total_sales_by_branch = df.groupby('Branch')['Total'].sum()

plt.figure(figsize=(10, 6), dpi=300)
box = sea.boxplot(data=df, x='Branch', y='Total', order=branch_order)

# Label every box with its rounded median, placed slightly above the median
for position, branch in enumerate(branch_order):
    branch_totals = df[df['Branch'] == branch]['Total']
    median_val = round(branch_totals.median())
    box.text(
        position,
        median_val * 1.05,
        f'Median: {median_val}',
        ha='center',
        size='small',
        color='black',
        weight='semibold',
    )

plt.title('Sales Distribution by Branch', fontsize=16, fontweight='bold')
plt.ylabel('Sales', fontsize=14)
plt.xlabel('Branch', fontsize=14)
plt.yticks(fontsize=12)
plt.xticks(fontsize=12)
plt.show()
# Best product line for Branch C, compared by total sales.
is_branch_c = df['Branch'] == 'C'
branch_c_data = df[is_branch_c]
product_line_sales = branch_c_data.groupby('Product line')['Total'].sum()

plt.figure(figsize=(10, 6), dpi=300)

# Pull the best-selling product line's wedge out of the pie
max_product_line = product_line_sales.idxmax()
explode = [0.1 if name == max_product_line else 0 for name in product_line_sales.index]

plt.pie(
    product_line_sales,
    labels=product_line_sales.index,
    autopct='%1.1f%%',
    startangle=140,
    explode=explode,
    textprops={'fontsize': 14, 'fontweight': 'semibold'},
)
plt.title('Sales Distribution by Product Line in Branch C', fontsize=16, fontweight='semibold')
plt.tight_layout()
plt.show()
# Best product line for Branch C, this time compared by gross income.
# NOTE: product_line_sales is rebound here to the gross-income totals.
product_line_sales = branch_c_data.groupby('Product line')['gross income'].sum()

plt.figure(figsize=(10, 8), dpi=300)
palette = sea.color_palette("viridis", len(product_line_sales))
# Draw the bars exactly once. A second barplot call on the same axes stacks an
# identical set of bars (and patches) on top of the first, so every value
# label in the loop below would be drawn twice.
gross_bar = sea.barplot(x=product_line_sales.index, y=product_line_sales.values, palette=palette)

# Add the value of each bar just above its top edge
for patch in gross_bar.patches:
    gross_bar.annotate(
        format(patch.get_height(), '.0f'),
        (patch.get_x() + patch.get_width() / 2., patch.get_height()),
        ha='center',
        va='center',
        xytext=(0, 9),
        textcoords='offset points',
        fontweight='semibold',
    )

plt.title('Gross Income Distribution by Product Line in Branch C', fontsize=16, fontweight='semibold')
plt.xlabel('Product Line', fontsize=14, fontweight='semibold')
plt.ylabel('Gross Income', fontsize=14, fontweight='semibold')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
# Average rating for each product line in Branch C, sorted ascending so the
# best-rated line ends up at the top of the horizontal bar chart.
avg_rating_per_product_line = branch_c_data.groupby('Product line')['Rating'].mean().sort_values()

fig, ax = plt.subplots(figsize=(10, 6))

# Use one norm + colormap for both the bars and the colorbar so that a bar's
# colour really corresponds to its rating on the colour scale (an evenly
# spaced palette would only reflect rank, not value).
rating_norm = plt.Normalize(
    vmin=avg_rating_per_product_line.min(),
    vmax=avg_rating_per_product_line.max(),
)
rating_cmap = plt.get_cmap("Spectral")
bars = ax.barh(
    avg_rating_per_product_line.index,
    avg_rating_per_product_line.values,
    color=rating_cmap(rating_norm(avg_rating_per_product_line.values)),
)

# Colour scale on the side. The mappable is not attached to any Axes, so the
# target Axes must be passed explicitly; recent Matplotlib versions refuse to
# guess where to place the colorbar otherwise.
sm = plt.cm.ScalarMappable(cmap=rating_cmap, norm=rating_norm)
sm.set_array([])
fig.colorbar(sm, ax=ax)

plt.title('Product Line Average Ratings', fontsize=16, fontweight='bold')
plt.xlabel('Ratings', fontsize=14, fontweight='bold')
plt.ylabel('Product Line', fontsize=14, fontweight='bold')
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()
# Relationship between unit price and quantity purchased for each product
# line in Branch C: one scatter subplot per product line with a least-squares
# best-fit line and the correlation coefficient R in the legend.
product_lines = branch_c_data['Product line'].unique()
plt.figure(figsize=(15, 20), dpi=300)
marker_1 = ['o', 's', 'H', 'p', 'v', '*']
colors = sea.color_palette('hsv', len(product_lines))

for i, product_line in enumerate(product_lines, start=1):
    # Rows belonging to the current product line
    product_data = branch_c_data[branch_c_data['Product line'] == product_line]

    # linregress returns the slope/intercept of the best-fit line and the
    # Pearson correlation (r_value), so no separate corrcoef call is needed.
    slope, intercept, r_value, p_value, std_err = stats.linregress(
        product_data['Unit price'], product_data['Quantity']
    )

    # One subplot row per product line
    plt.subplot(len(product_lines), 1, i)
    plt.scatter(
        product_data['Unit price'],
        product_data['Quantity'],
        # Cycle through the markers so more product lines than markers does
        # not raise an IndexError.
        marker=marker_1[(i - 1) % len(marker_1)],
        color=colors[i - 1],
        label=f'{product_line} (R = {r_value:.2f})',
    )

    # Best-fit line evaluated at the observed prices
    line = slope * product_data['Unit price'] + intercept
    plt.plot(product_data['Unit price'], line, color='red')

    plt.title(f'{product_line} - Unit Price vs Quantity Purchased', fontsize=16, fontweight='bold')
    plt.xlabel('Unit Price ($)', fontsize=14, fontweight='bold')
    plt.ylabel('Quantity Purchased', fontsize=14, fontweight='bold')
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()
# Overall price/quantity relationship for Branch C: total quantity sold at
# each distinct unit price, with a least-squares best-fit line.
# NOTE: product_lines is rebound here to this aggregated price/quantity table.
product_lines = branch_c_data.groupby('Unit price')['Quantity'].sum().reset_index()
unit_prices = product_lines['Unit price']
quantities = product_lines['Quantity']

slope, intercept, r_value, p_value, std_err = stats.linregress(unit_prices, quantities)
line = slope * unit_prices + intercept

plt.figure(figsize=(10, 6), dpi=300)
plt.plot(unit_prices, line, color='red')
plt.scatter(unit_prices, quantities, label=f'(R = {r_value:.2f})')
plt.title('Unit Price vs Quantity Purchased', fontsize=16, fontweight='bold')
plt.ylabel('Quantity Purchased', fontsize=14, fontweight='bold')
plt.xlabel('Unit Price ($)', fontsize=14, fontweight='bold')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(alpha=0.4)
plt.show()
# Intra-day sales: which hour of the day brings in the most revenue?
# Work on an explicit copy so that new columns can be added to the subset.
branch_c_data = df[df['Branch'] == 'C'].copy()
parsed_times = pd.to_datetime(branch_c_data['Time'], format='%H:%M')
branch_c_data['Hour'] = parsed_times.dt.hour

# Despite the name, this holds the summed 'Total' per hour, not a row count
hourly_counts = branch_c_data.groupby("Hour")['Total'].sum()

plt.figure(figsize=(10, 6), dpi=300)
plt.bar(
    hourly_counts.index,
    hourly_counts.values,
    color='skyblue',
    alpha=0.5,
    edgecolor='black',
)
plt.title('Intra-Day Sales Sum by Hours for Branch C', fontsize=16, fontweight='bold')
plt.ylabel('Sales ($)', fontsize=14, fontweight='bold')
plt.xlabel('Hours of the Day', fontsize=14, fontweight='bold')
plt.xticks(range(0, 24), rotation=0)
plt.grid(axis='y')
plt.tight_layout()
plt.show()
branch_c_data.head()
| Branch | Customer type | Gender | Product line | Unit price | Quantity | Total | Date | Time | Payment | cogs | gross income | Rating | Hour | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | C | Normal | Female | Electronic accessories | 15.28 | 5 | 80.220 | 3/8/2019 | 10:29 | Cash | 76.40 | 3.820 | 9.6 | 10 |
| 17 | C | Member | Female | Food and beverages | 99.42 | 4 | 417.564 | 2/6/2019 | 10:42 | Ewallet | 397.68 | 19.884 | 7.5 | 10 |
| 18 | C | Member | Female | Sports and travel | 68.12 | 1 | 71.526 | 1/7/2019 | 12:28 | Ewallet | 68.12 | 3.406 | 6.8 | 12 |
| 21 | C | Member | Male | Home and lifestyle | 56.11 | 2 | 117.831 | 2/2/2019 | 10:11 | Cash | 112.22 | 5.611 | 6.3 | 10 |
| 23 | C | Member | Female | Food and beverages | 98.70 | 8 | 829.080 | 3/4/2019 | 20:39 | Cash | 789.60 | 39.480 | 7.6 | 20 |
# Intra-day customer visits: number of transactions per hour of the day.
hour_bins = range(0, 25)  # one bin per hour: [0, 1), [1, 2), ... [23, 24]
plt.figure(figsize=(10, 6), dpi=300)
plt.hist(branch_c_data['Hour'], bins=hour_bins, alpha=0.7, edgecolor='black')
plt.title('Intra-Day Sales Frequency by Hour for Branch C', fontsize=16, fontweight='bold')
plt.ylabel('Number of Transactions', fontsize=14, fontweight='bold')
plt.xlabel('Hour of the Day', fontsize=14, fontweight='bold')
plt.xticks(range(0, 24), rotation=0)
plt.grid(axis='y')
plt.tight_layout()
plt.show()
# Which day of the week has the most sales?
branch_c_data['Date'] = pd.to_datetime(branch_c_data['Date'])
# Weekday name for every transaction
branch_c_data['Day of Week'] = branch_c_data['Date'].dt.day_name()

# Total sales per weekday, arranged Monday..Sunday
days_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
sales_by_day = branch_c_data.groupby('Day of Week')['Total'].sum().reindex(days_order)

# Bar chart with the value written above each bar
plt.figure(figsize=(10, 6), dpi=300)
day_bar = sales_by_day.plot(kind='bar', color='skyblue', edgecolor='black')
for patch in day_bar.patches:
    bar_height = patch.get_height()
    day_bar.annotate(
        f'{bar_height:.0f}',
        (patch.get_x() + patch.get_width() / 2., bar_height),
        ha='center',
        va='center',
        xytext=(0, 9),
        textcoords='offset points',
        fontweight='semibold',
    )

# Titles and labels
plt.title('Sales by Day of Week for Branch C', fontsize=16, fontweight='bold')
plt.ylabel('Total Sales', fontsize=14, fontweight='bold')
plt.xlabel('Day of Week', fontsize=14, fontweight='bold')
plt.xticks(rotation=45)
plt.grid(axis='y')
plt.tight_layout()
plt.show()
# Heat map of total sales for every product line on every day of the week.
sales_pivot = branch_c_data.pivot_table(index='Product line', columns='Day of Week', values='Total', aggfunc='sum')

# Put the weekday columns in calendar order. reindex (rather than plain column
# selection) keeps this working when a weekday has no transactions at all:
# the missing day becomes an empty column instead of raising a KeyError.
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
sales_pivot = sales_pivot.reindex(columns=day_order)

# Plot the heatmap
plt.figure(figsize=(10, 8), dpi=300)
sea.heatmap(sales_pivot, annot=True, fmt=".0f", cmap='YlGnBu', linewidths=.5)
plt.title('Sales by Product Line and Day of the Week', fontsize=16, fontweight='bold')
plt.ylabel('Product Line', fontsize=14, fontweight='bold')
plt.xlabel('Day of the Week', fontsize=14, fontweight='bold')
plt.show()
# Heat map of the quantity sold for every product line on every day of the week.
# NOTE: sales_pivot is rebound here to the quantity table.
sales_pivot = branch_c_data.pivot_table(index='Product line', columns='Day of Week', values='Quantity', aggfunc='sum')

# Calendar order for the weekday columns. reindex tolerates a weekday with no
# transactions (it yields an empty column) where plain column selection would
# raise a KeyError.
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
sales_pivot = sales_pivot.reindex(columns=day_order)

# Plot the heatmap
plt.figure(figsize=(10, 8), dpi=300)
sea.heatmap(sales_pivot, annot=True, fmt=".0f", cmap='YlOrRd', linewidths=.5)
plt.title('Quantity Purchased by Product Line and Day of the Week', fontsize=16, fontweight='bold')
plt.ylabel('Product Line', fontsize=14, fontweight='bold')
plt.xlabel('Day of the Week', fontsize=14, fontweight='bold')
plt.show()
# Weekly sales totals over the period covered by the data.
branch_c_data['Date'] = pd.to_datetime(branch_c_data['Date'])
# ISO week number of every transaction date
iso_calendar = branch_c_data['Date'].dt.isocalendar()
branch_c_data['Week'] = iso_calendar['week']

# Total sales per week
weekly_sales = branch_c_data.groupby('Week')['Total'].sum()

# Line chart of the weekly totals
plt.figure(figsize=(12, 6), dpi=300)
weekly_sales.plot(kind='line', marker='d', linestyle='-')
plt.title('Weekly Sales Analysis', fontsize=16, fontweight='bold')
plt.ylabel('Total Sales', fontsize=14, fontweight='bold')
plt.xlabel('Week of the Year', fontsize=14, fontweight='bold')
plt.grid(True)
plt.tight_layout()
plt.show()
# Share of each customer type among Branch C transactions.
customer_types = branch_c_data['Customer type'].value_counts()

plt.figure(figsize=(10, 6), dpi=100)
plt.pie(
    customer_types,
    labels=customer_types.index,
    autopct='%1.1f%%',
    startangle=140,
    textprops={'fontsize': 14, 'fontweight': 'semibold'},
    shadow=True,
)
plt.title("Percentage of Customer Types in Branch C", fontsize=16, fontweight='bold')
plt.show()
# How much do members spend compared with normal customers, per product line?
# Rows are customer types (the x-axis groups) and columns are product lines
# (the legend entries). fill_value=0 makes a customer type with no purchases
# in some product line show a 0 bar instead of a NaN bar labelled 'nan'.
spending_cust_type = (
    branch_c_data.groupby(['Customer type', 'Product line'])['Total']
    .sum()
    .unstack(fill_value=0)
)
ax = spending_cust_type.plot(kind='bar', figsize=(12, 8), width=0.8, edgecolor='black')

# Write the value of each bar just above its top edge
for patch in ax.patches:
    ax.annotate(
        format(patch.get_height(), '.0f'),
        (patch.get_x() + patch.get_width() / 2., patch.get_height()),
        ha='center',
        va='center',
        xytext=(0, 9),
        textcoords='offset points',
    )

plt.title('Comparison of Spending by Customer Type and Product Line', fontsize=16, fontweight='bold')
# The bar groups along the x-axis are the customer types; the product lines
# are distinguished by colour and named in the legend.
plt.xlabel('Customer Type', fontsize=14, fontweight='bold')
plt.ylabel('Total Spending', fontsize=14, fontweight='bold')
plt.xticks(rotation=45)
plt.legend(title='Product Line')
plt.grid(axis='y')
plt.tight_layout()
plt.show()
# On which weekday do members and normal customers prefer to visit?
# Count transactions per (customer type, weekday); rows are customer types,
# columns are weekdays. fill_value=0 turns missing combinations into a count
# of 0 rather than NaN.
customer_day_preference = (
    branch_c_data.groupby(['Customer type', 'Day of Week'])
    .size()
    .unstack(fill_value=0)
)

# Put the weekday columns in calendar order. reindex keeps this working when a
# weekday has no visits at all (plain column selection would raise a KeyError).
days_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
customer_day_preference = customer_day_preference.reindex(columns=days_order, fill_value=0)

# Grouped bar chart with the count written above each bar
bro = customer_day_preference.plot(kind='bar', figsize=(12, 8), width=0.8, edgecolor='black')
for patch in bro.patches:
    bro.annotate(
        format(patch.get_height(), '.0f'),
        (patch.get_x() + patch.get_width() / 2., patch.get_height()),
        ha='center',
        va='center',
        xytext=(0, 9),
        textcoords='offset points',
    )

plt.title('Customer Visits by Day of Week and Customer Type', fontsize=16, fontweight='bold')
plt.xlabel('Customer Type', fontsize=14, fontweight='bold')
plt.ylabel('Number of Visits', fontsize=14, fontweight='bold')
plt.xticks(rotation=45)
plt.legend(title='Day of the Week')
plt.grid(axis='y')
plt.tight_layout()
plt.show()
# Do members spend more per transaction than normal customers?
# Box plot of 'Total' per customer type with the median written on each box.
proper_order = ['Normal', 'Member']

plt.figure(figsize=(10, 6), dpi=300)
# Fix the box order explicitly. Without `order`, seaborn arranges the boxes by
# the order in which the categories appear in the data, which need not match
# the order used for the median labels below.
box_2 = sea.boxplot(x='Customer type', y='Total', data=branch_c_data, order=proper_order)

medians = branch_c_data.groupby(['Customer type'])['Total'].median()
medians = medians.reindex(proper_order)
median_labels = [f'{median:.2f}' for median in medians]

# Iterate over the values directly; integer indexing such as medians[0] on a
# label-indexed Series is deprecated positional access.
for position, (median, label) in enumerate(zip(medians, median_labels)):
    box_2.text(position, median * 1.05, label,
               ha='center', color='black', weight='semibold')

plt.title('Average Amount Spent by Customer Type', fontsize=16, fontweight='bold')
plt.xlabel('Customer Type', fontsize=14, fontweight='bold')
plt.ylabel('Average Amount Spent ($)', fontsize=14, fontweight='bold')
plt.grid()
plt.show()
# Gender split of the transactions at Branch C.
gender_c = branch_c_data['Gender'].value_counts()
colors = ['lightblue', 'coral']

plt.figure(figsize=(10, 6), dpi=100)
plt.pie(
    gender_c,
    labels=gender_c.index,
    autopct='%1.1f%%',
    startangle=65,
    colors=colors,
    textprops={'fontweight': 'semibold', 'fontsize': 12},
    shadow=True,
)
plt.title("Gender Distribution Among Branch C Customers", fontsize=16, fontweight='bold')
plt.show()
# Membership split within each gender at Branch C.
# Rows are genders, columns are customer types, values are transaction counts.
gender_type = branch_c_data.groupby(['Gender', 'Customer type']).size().unstack()

# Normalise each row so the stacked bars show proportions, not absolute counts
gender_type_normal = gender_type.div(gender_type.sum(axis=1), axis=0)
palette = sea.color_palette("crest")

# DataFrame.plot opens a figure of its own unless it is handed an Axes, which
# would leave a separately created plt.figure(...) empty. Create the figure
# and Axes once and draw into them.
fig, ax = plt.subplots(figsize=(12, 6), dpi=300)
gender_type_normal.plot(kind='bar', stacked=True, color=palette, width=0.6, ax=ax)

plt.grid(alpha=0.4)
plt.title('Membership Distribution Within Each Gender at Branch C', fontsize=16)
plt.xlabel('Gender', fontsize=14, fontweight='bold')
plt.ylabel('Proportion', fontsize=14, fontweight='bold')
plt.legend(title='Membership Status', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()
<Figure size 3600x1800 with 0 Axes>
# Total spending at Branch C split by gender.
total_spending_gender = branch_c_data.groupby('Gender')['Total'].sum()

plt.figure(figsize=(10, 6), dpi=300)
total_spending_gender.plot(kind='bar', color=['blue', 'orange'], edgecolor='black')
plt.title('Total Spending by Gender', fontsize=16, fontweight='bold')
plt.ylabel('Total Spending ($)', fontsize=14, fontweight='bold')
plt.xlabel('Gender', fontsize=14, fontweight='bold')

# Value label sitting on top of each bar
for position, amount in enumerate(total_spending_gender):
    plt.text(position, amount, format(amount, '.0f'), ha='center', va='bottom')

plt.grid(alpha=0.4)
plt.tight_layout()
plt.show()
# Which product lines does each gender spend the most on?
# Total spending per (product line, gender); rows are product lines and
# columns are genders.
gender_product_line = branch_c_data.groupby(['Product line', 'Gender'])['Total'].sum().unstack()

plt.figure(figsize=(10, 6), dpi=300)

# Two horizontal bars per product line: Female at the integer position and
# Male shifted up by one bar height so the pair sits side by side.
bar_width = 0.35  # thickness of each bar
index = gender_product_line.index
female_positions = range(len(index))
male_positions = [slot + bar_width for slot in female_positions]

# Male bars
plt.barh(
    male_positions,
    gender_product_line['Male'],
    height=bar_width,
    label='Male',
    color='royalblue',
    edgecolor='black',
)
# Female bars
plt.barh(
    female_positions,
    gender_product_line['Female'],
    height=bar_width,
    label='Female',
    color='fuchsia',
    edgecolor='black',
)

# Centre each y-tick between the two bars of its product line
tick_positions = [slot + bar_width / 2 for slot in female_positions]
plt.yticks(tick_positions, gender_product_line.index)

plt.title('Total Spending by Gender for Each Product Line', fontsize=16, fontweight='bold')
plt.ylabel('Product Line', fontsize=14, fontweight='bold')
plt.xlabel('Total Spending ($)', fontsize=14, fontweight='bold')
plt.legend()
plt.tight_layout()
plt.show()
# Relationship between product line and payment method: number of
# transactions for every (product line, payment method) pair.
payment_counts = branch_c_data.groupby(['Product line', 'Payment']).size()
rs_product_payment = payment_counts.unstack()

plt.figure(figsize=(10, 6), dpi=300)
sea.heatmap(rs_product_payment, annot=True, cmap='YlGn', linewidths=.5)

# Title and axis labels
plt.title('Frequency of Product Line Against Payment Method', fontsize=16, fontweight='bold')
plt.ylabel('Product Line', fontsize=14, fontweight='bold')
plt.xlabel('Payment Method', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()
# Relationship between payment method and sales: total sales for every
# (product line, payment method) pair.
# NOTE: rs_product_payment is rebound here to the sales totals.
payment_sales = branch_c_data.groupby(['Product line', 'Payment'])['Total'].sum()
rs_product_payment = payment_sales.unstack()

plt.figure(figsize=(10, 6), dpi=300)
sea.heatmap(rs_product_payment, annot=True, fmt='.0f', cmap='Blues', linewidths=.5)

# Title and axis labels
plt.title('Total Sales of Product Line Against Payment Method', fontsize=16, fontweight='bold')
plt.ylabel('Product Line', fontsize=14, fontweight='bold')
plt.xlabel('Payment Method', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()
from statsmodels.tsa.arima.model import ARIMA
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
# Daily sales series for Branch C: one row per date that has transactions,
# sorted by date (groupby sorts its keys).
daily_sales = branch_c_data.groupby('Date')['Total'].sum().reset_index()

# The Augmented Dickey-Fuller test is a hypothesis test for whether the given
# time series is stationary or not.
result = adfuller(daily_sales['Total'])
print('p-value:', result[1])

# The series is treated as stationary when the p-value is below 0.05, in which
# case d = 0 in the ARIMA order (p, d, q):
#   p is the number of autoregressive terms (chosen from the PACF)
#   d is the number of differences
#   q is the number of moving-average terms (chosen from the ACF)
model = ARIMA(daily_sales['Total'], order=(5, 0, 6))
model_fit = model.fit()
forecast = model_fit.get_forecast(steps=100)

# The forecast continues AFTER the observed data, so its dates must start on
# the day following the last observed date. (Indexing with iloc[1] would pick
# the second observation and draw the forecast on top of the history.)
last_date = pd.to_datetime(daily_sales['Date'].iloc[-1])
forecast_index = pd.date_range(start=last_date + pd.Timedelta(days=1), periods=100)
forecast_values = forecast.predicted_mean

plt.figure(figsize=(12, 6), dpi=300)
plt.plot(daily_sales['Date'], daily_sales['Total'], label='Observed')
plt.plot(forecast_index, forecast_values, label='Forecast')
plt.title('Forecasted Sales per day using ARIMA model', fontweight='bold', fontsize=16)
plt.ylabel('Sales', fontweight='bold', fontsize=14)
plt.xlabel('Date', fontweight='bold', fontsize=14)
plt.legend()
plt.grid(alpha=0.4)
plt.show()
p-value: 7.126034181616563e-19
C:\Users\Aaron\anaconda3\lib\site-packages\statsmodels\tsa\statespace\sarimax.py:966: UserWarning: Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.
warn('Non-stationary starting autoregressive parameters'
C:\Users\Aaron\anaconda3\lib\site-packages\statsmodels\tsa\statespace\sarimax.py:978: UserWarning: Non-invertible starting MA parameters found. Using zeros as starting parameters.
warn('Non-invertible starting MA parameters found.'
C:\Users\Aaron\anaconda3\lib\site-packages\statsmodels\base\model.py:604: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
warnings.warn("Maximum Likelihood optimization failed to "
# Check the ACF and PACF of the daily sales series
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# plot_acf / plot_pacf open a figure of their own unless they are given an
# Axes, which would leave a separately created plt.figure(...) empty. Create
# the figure and Axes first and pass the Axes in.

# ACF, used to choose the MA order q
fig, ax = plt.subplots(figsize=(14, 7), dpi=300)
plot_acf(daily_sales['Total'], lags=50, alpha=0.05, ax=ax)
ax.set_title('Autocorrelation Function')
plt.show()

# PACF, used to choose the AR order p. lags is the number of lags to show.
# method='ywm' (unadjusted Yule-Walker) keeps the PACF values inside [-1, 1];
# the statsmodels FutureWarning recommends setting it explicitly.
fig, ax = plt.subplots(figsize=(14, 7))
plot_pacf(daily_sales['Total'], lags=40, alpha=0.05, method='ywm', ax=ax)
ax.set_title('Partial Autocorrelation Function')
plt.show()
<Figure size 4200x2100 with 0 Axes>
C:\Users\Aaron\anaconda3\lib\site-packages\statsmodels\graphics\tsaplots.py:348: FutureWarning: The default method 'yw' can produce PACF values outside of the [-1,1] interval. After 0.13, the default will change tounadjusted Yule-Walker ('ywm'). You can use this method now by setting method='ywm'.
warnings.warn(
<Figure size 1008x504 with 0 Axes>